{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d4142e32",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/Malaysian-STT-Whisper/resolve/main/extra.zip\n",
    "# !unzip extra.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "297940f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/Malaysian-STT-Whisper/resolve/main/data/extra-00000-of-00001.parquet\n",
    "# !wget https://huggingface.co/datasets/mesolitica/Malaysian-STT-Whisper/resolve/main/data/science_context-00000-of-00001.parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dce504f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/pseudolabel-science-large-v3-timestamp/resolve/main/check-same-science-context.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e83cbbf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "word = [\n",
    "    'audio to Whisper ASR format word timestamp',\n",
    "    'transcribe the audio into Whisper format in word timestamp'\n",
    "]\n",
    "\n",
    "srt = [\n",
    "    'audio to SRT format',\n",
    "    'transcribe the audio into srt format',\n",
    "]\n",
    "\n",
    "ttml = [\n",
    "    'audio to TTML format',\n",
    "    'transcribe the audio into ttml format',\n",
    "]\n",
    "\n",
    "segment = [\n",
    "    'audio to Whisper ASR format',\n",
    "    'transcribe the audio into Whisper format'\n",
    "]\n",
    "\n",
    "transcribe = ['transcribe the audio']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "29fc4120",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "import random\n",
    "import os\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "from glob import glob\n",
    "from xml.etree.ElementTree import Element, SubElement, tostring\n",
    "import xml.dom.minidom\n",
    "\n",
    "pattern = re.compile(r'<\\|([\\d.]+)\\|>([^<]*)')\n",
    "\n",
    "def format_srt_time(seconds):\n",
    "    hours = int(seconds // 3600)\n",
    "    minutes = int((seconds % 3600) // 60)\n",
    "    secs = int(seconds % 60)\n",
    "    millis = int((seconds - int(seconds)) * 1000)\n",
    "    return f\"{hours:02}:{minutes:02}:{secs:02},{millis:03}\"\n",
    "\n",
    "def get_subtitles(input_text):\n",
    "    entries = pattern.findall(input_text)\n",
    "\n",
    "    subtitles = []\n",
    "    for i in range(len(entries) - 1):\n",
    "        start_time = float(entries[i][0])\n",
    "        end_time = float(entries[i + 1][0])\n",
    "        text = entries[i][1].strip()\n",
    "        if text:\n",
    "            subtitles.append((start_time, end_time, text))\n",
    "            \n",
    "    return subtitles\n",
    "\n",
    "def get_srt(subtitles):\n",
    "    srt_output = \"\"\n",
    "    for idx, (start, end, text) in enumerate(subtitles, start=1):\n",
    "        srt_output += f\"{idx}\\n\"\n",
    "        srt_output += f\"{format_srt_time(start)} --> {format_srt_time(end)}\\n\"\n",
    "        srt_output += f\"{text}\\n\\n\"\n",
    "    return srt_output\n",
    "\n",
    "def format_ttml_time(seconds):\n",
    "    return f\"{int(seconds // 3600):02}:{int((seconds % 3600) // 60):02}:{int(seconds % 60):02}.{int((seconds % 1) * 1000):03}\"\n",
    "\n",
    "def get_tt(subtitles):\n",
    "    tt = Element('tt', xmlns=\"http://www.w3.org/ns/ttml\")\n",
    "    body = SubElement(tt, 'body')\n",
    "    div = SubElement(body, 'div')\n",
    "\n",
    "    for start, end, text in subtitles:\n",
    "        SubElement(div, 'p', begin=format_ttml_time(start), end=format_ttml_time(end)).text = text\n",
    "\n",
    "    xml_str = xml.dom.minidom.parseString(tostring(tt)).toprettyxml(indent=\"  \")\n",
    "    return str(xml_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f4ed2acc",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df = pd.read_parquet('extra-00000-of-00001.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "824fa3ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "30984"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "extra = []\n",
    "\n",
    "for i in range(len(df)):\n",
    "    if not os.path.exists(df['audio_filename'].iloc[i]):\n",
    "        continue\n",
    "    t = df['segment_timestamp'].iloc[i]\n",
    "    extra.append({\n",
    "        'question': random.choice(segment),\n",
    "        'answer': t,\n",
    "        'audio_filename': df['audio_filename'].iloc[i],\n",
    "    })\n",
    "    \n",
    "    extra.append({\n",
    "        'question': random.choice(transcribe),\n",
    "        'answer': re.sub(r\"<\\|.*?\\|>\", \"\", t).strip(),\n",
    "        'audio_filename': df['audio_filename'].iloc[i],\n",
    "    })\n",
    "    \n",
    "    try:\n",
    "        input_text = t.split('<|transcribe|>')[1]\n",
    "        subtitles = get_subtitles(input_text)\n",
    "    except:\n",
    "        continue\n",
    "        \n",
    "    try:\n",
    "        extra.append({\n",
    "            'question': random.choice(srt),\n",
    "            'answer': get_srt(subtitles),\n",
    "            'audio_filename': df['audio_filename'].iloc[i],\n",
    "        })\n",
    "    except:\n",
    "        pass\n",
    "    \n",
    "    try:\n",
    "        extra.append({\n",
    "            'question': random.choice(ttml),\n",
    "            'answer': get_tt(subtitles),\n",
    "            'audio_filename': df['audio_filename'].iloc[i],\n",
    "        })\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        pass\n",
    "    \n",
    "len(extra)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "be73ed6b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'transcribe the audio into ttml format',\n",
       " 'answer': '<?xml version=\"1.0\" ?>\\n<tt xmlns=\"http://www.w3.org/ns/ttml\">\\n  <body>\\n    <div>\\n      <p begin=\"00:00:01.219\" end=\"00:00:05.839\">Pada pendapat kitak nak, seronok sik permainan congkak tok?</p>\\n      <p begin=\"00:00:06.839\" end=\"00:00:12.519\">Kamek rasa nya best sebab aura persaingan nya sangatlah kuat.</p>\\n    </div>\\n  </body>\\n</tt>\\n',\n",
       " 'audio_filename': 'audio/SM_FF_CONGKAK_001-6.mp3'}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "extra[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "feb239e5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>audio_filename</th>\n",
       "      <th>word_timestamp</th>\n",
       "      <th>segment_timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>prepared-science-chunks/0-1.mp3</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribeprecise...</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribe|&gt;&lt;|0.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>prepared-science-chunks/2-0.mp3</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribeprecise...</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribe|&gt;&lt;|0.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>prepared-science-chunks/2-1.mp3</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribeprecise...</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribe|&gt;&lt;|0.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>prepared-science-chunks/3-0.mp3</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribeprecise...</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribe|&gt;&lt;|0.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>prepared-science-chunks/4-0.mp3</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribeprecise...</td>\n",
       "      <td>&lt;|startoftranscript|&gt;&lt;|en|&gt;&lt;|transcribe|&gt;&lt;|0.0...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    audio_filename  \\\n",
       "0  prepared-science-chunks/0-1.mp3   \n",
       "1  prepared-science-chunks/2-0.mp3   \n",
       "2  prepared-science-chunks/2-1.mp3   \n",
       "3  prepared-science-chunks/3-0.mp3   \n",
       "4  prepared-science-chunks/4-0.mp3   \n",
       "\n",
       "                                      word_timestamp  \\\n",
       "0  <|startoftranscript|><|en|><|transcribeprecise...   \n",
       "1  <|startoftranscript|><|en|><|transcribeprecise...   \n",
       "2  <|startoftranscript|><|en|><|transcribeprecise...   \n",
       "3  <|startoftranscript|><|en|><|transcribeprecise...   \n",
       "4  <|startoftranscript|><|en|><|transcribeprecise...   \n",
       "\n",
       "                                   segment_timestamp  \n",
       "0  <|startoftranscript|><|en|><|transcribe|><|0.0...  \n",
       "1  <|startoftranscript|><|en|><|transcribe|><|0.0...  \n",
       "2  <|startoftranscript|><|en|><|transcribe|><|0.0...  \n",
       "3  <|startoftranscript|><|en|><|transcribe|><|0.0...  \n",
       "4  <|startoftranscript|><|en|><|transcribe|><|0.0...  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_parquet('science_context-00000-of-00001.parquet')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "a2c2711a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "849147it [00:48, 17379.36it/s]\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "\n",
    "def detect_repeated_phrases(text, n=3):\n",
    "    words = text.lower().split()\n",
    "    phrases = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]\n",
    "    counter = Counter(phrases)\n",
    "    return [phrase for phrase, count in counter.items() if count > 2]\n",
    "\n",
    "science = []\n",
    "\n",
    "with open('check-same-science-context.jsonl') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        l = json.loads(l)\n",
    "        audio_filename = l['audio_filename']\n",
    "        if not os.path.exists(audio_filename):\n",
    "            continue\n",
    "        \n",
    "        t = l['new_text']\n",
    "        t_ = re.sub(r\"<\\|.*?\\|>\", \"\", t).strip()\n",
    "        if len(t_.split()) < 10:\n",
    "            continue\n",
    "            \n",
    "        if len(detect_repeated_phrases(t_, n=3)):\n",
    "            continue\n",
    "        \n",
    "        if random.random() > 0.2:\n",
    "            continue\n",
    "        \n",
    "        science.append({\n",
    "            'question': random.choice(segment),\n",
    "            'answer': t,\n",
    "            'audio_filename': audio_filename,\n",
    "        })\n",
    "\n",
    "        science.append({\n",
    "            'question': random.choice(transcribe),\n",
    "            'answer': t_,\n",
    "            'audio_filename': audio_filename,\n",
    "        })\n",
    "        \n",
    "        try:\n",
    "            input_text = t.split('<|transcribe|>')[1]\n",
    "            subtitles = get_subtitles(input_text)\n",
    "        except Exception as e:\n",
    "            print(e)\n",
    "            continue\n",
    "        \n",
    "        if random.random() > 0.7:\n",
    "            try:\n",
    "                science.append({\n",
    "                    'question': random.choice(srt),\n",
    "                    'answer': get_srt(subtitles),\n",
    "                    'audio_filename': df['audio_filename'].iloc[i],\n",
    "                })\n",
    "            except:\n",
    "                pass\n",
    "        \n",
    "        if random.random() > 0.7:\n",
    "            try:\n",
    "                science.append({\n",
    "                    'question': random.choice(ttml),\n",
    "                    'answer': get_tt(subtitles),\n",
    "                    'audio_filename': df['audio_filename'].iloc[i],\n",
    "                })\n",
    "            except Exception as e:\n",
    "                print(e)\n",
    "                pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "95fba506",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "402124"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(science)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "f02503bb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'question': 'transcribe the audio into Whisper format',\n",
       "  'answer': \"<|startoftranscript|><|en|><|transcribe|><|0.00|> significant number of people in this world who also has experienced lost by way of a missing<|6.36|><|6.36|> person. And that is what happened with my family. So in 1987, my sister, Anita Wiley, disappeared<|14.84|><|14.84|> from Detroit really without a trace. And what makes the story a little more complicated is that<|21.26|><|21.26|> I never knew about Anita. She was a half-sister on my father's side. And I learned about her<|28.00|><|28.00|><|endoftext|>\",\n",
       "  'audio_filename': 'chunk/mp3-16k-0-0_003.mp3'},\n",
       " {'question': 'transcribe the audio',\n",
       "  'answer': \"significant number of people in this world who also has experienced lost by way of a missing person. And that is what happened with my family. So in 1987, my sister, Anita Wiley, disappeared from Detroit really without a trace. And what makes the story a little more complicated is that I never knew about Anita. She was a half-sister on my father's side. And I learned about her\",\n",
       "  'audio_filename': 'chunk/mp3-16k-0-0_003.mp3'},\n",
       " {'question': 'audio to Whisper ASR format',\n",
       "  'answer': \"<|startoftranscript|><|en|><|transcribe|><|0.00|> much because I love Detroit because Detroit is me. It is where I was born. It was where I was<|9.92|><|9.92|> raised. And what has been beautiful about being a journalist is really when I was 18, I moved away<|16.94|><|16.94|> from home to go to college. And I spent a short amount of time at home after college, but I've<|22.84|><|22.84|> been away from Detroit longer than I was there. I realized that recently. But because it was my<|28.90|><|28.90|><|endoftext|>\",\n",
       "  'audio_filename': 'chunk/mp3-16k-0-0_006.mp3'},\n",
       " {'question': 'transcribe the audio',\n",
       "  'answer': \"much because I love Detroit because Detroit is me. It is where I was born. It was where I was raised. And what has been beautiful about being a journalist is really when I was 18, I moved away from home to go to college. And I spent a short amount of time at home after college, but I've been away from Detroit longer than I was there. I realized that recently. But because it was my\",\n",
       "  'audio_filename': 'chunk/mp3-16k-0-0_006.mp3'},\n",
       " {'question': 'audio to SRT format',\n",
       "  'answer': \"1\\n00:00:00,000 --> 00:00:09,919\\nmuch because I love Detroit because Detroit is me. It is where I was born. It was where I was\\n\\n2\\n00:00:09,919 --> 00:00:16,940\\nraised. And what has been beautiful about being a journalist is really when I was 18, I moved away\\n\\n3\\n00:00:16,940 --> 00:00:22,839\\nfrom home to go to college. And I spent a short amount of time at home after college, but I've\\n\\n4\\n00:00:22,839 --> 00:00:28,899\\nbeen away from Detroit longer than I was there. I realized that recently. But because it was my\\n\\n\",\n",
       "  'audio_filename': 'prepared-science-chunks/4201-2.mp3'},\n",
       " {'question': 'transcribe the audio into Whisper format',\n",
       "  'answer': '<|startoftranscript|><|en|><|transcribe|><|0.00|> And I have had the pleasure as a journalist living, I think, maybe seven or eight states.<|6.34|><|6.60|> And I have lived in communities all throughout the country.<|10.86|><|11.32|> And what is reflected for me is what is so common about the human experience.<|16.50|><|16.96|> Some of the same things I see all across the country.<|20.00|><|20.00|> I experienced in Detroit and I see that in Detroit.<|23.20|><|24.56|> And I see Detroit all over the world and the different places that I lived and reported on.<|29.96|><|endoftext|>',\n",
       "  'audio_filename': 'chunk/mp3-16k-0-0_007.mp3'},\n",
       " {'question': 'transcribe the audio',\n",
       "  'answer': 'And I have had the pleasure as a journalist living, I think, maybe seven or eight states. And I have lived in communities all throughout the country. And what is reflected for me is what is so common about the human experience. Some of the same things I see all across the country. I experienced in Detroit and I see that in Detroit. And I see Detroit all over the world and the different places that I lived and reported on.',\n",
       "  'audio_filename': 'chunk/mp3-16k-0-0_007.mp3'},\n",
       " {'question': 'audio to Whisper ASR format',\n",
       "  'answer': '<|startoftranscript|><|en|><|transcribe|><|0.00|> And one of the things that I have always felt very passionately about, even as a young person,<|7.12|><|7.12|> when I knew I wanted to be a journalist, was to be able to tell the stories of Detroit<|11.50|><|11.50|> and tell them in a complete way, in a more holistic way, because I knew how Detroit was portrayed.<|18.08|><|18.48|> I knew how the world saw Detroit as a city on the decline, a city that was bankrupt.<|24.92|><|25.16|><|endoftext|>',\n",
       "  'audio_filename': 'chunk/mp3-16k-0-0_008.mp3'},\n",
       " {'question': 'transcribe the audio',\n",
       "  'answer': 'And one of the things that I have always felt very passionately about, even as a young person, when I knew I wanted to be a journalist, was to be able to tell the stories of Detroit and tell them in a complete way, in a more holistic way, because I knew how Detroit was portrayed. I knew how the world saw Detroit as a city on the decline, a city that was bankrupt.',\n",
       "  'audio_filename': 'chunk/mp3-16k-0-0_008.mp3'},\n",
       " {'question': 'transcribe the audio into srt format',\n",
       "  'answer': '1\\n00:00:00,000 --> 00:00:07,120\\nAnd one of the things that I have always felt very passionately about, even as a young person,\\n\\n2\\n00:00:07,120 --> 00:00:11,500\\nwhen I knew I wanted to be a journalist, was to be able to tell the stories of Detroit\\n\\n3\\n00:00:11,500 --> 00:00:18,079\\nand tell them in a complete way, in a more holistic way, because I knew how Detroit was portrayed.\\n\\n4\\n00:00:18,480 --> 00:00:24,920\\nI knew how the world saw Detroit as a city on the decline, a city that was bankrupt.\\n\\n',\n",
       "  'audio_filename': 'prepared-science-chunks/4201-2.mp3'}]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "science[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "f6e2ea03",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(extra).to_parquet('extra-stt-extra.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "63e384d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(science).to_parquet('extra-stt-science.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "58f92100",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uploading files using Xet Storage..\n",
      "Uploading...: 100%|███████████████████████████| 931k/931k [00:05<00:00, 158kB/s]\n",
      "https://huggingface.co/datasets/mesolitica/Transcription-Instructions/blob/main//data/extra-00000-of-00001.parquet\n"
     ]
    }
   ],
   "source": [
    "!huggingface-cli upload mesolitica/Transcription-Instructions \\\n",
    "extra-stt-extra.parquet /data/extra-00000-of-00001.parquet --repo-type=dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "1a7ec998",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uploading files using Xet Storage..\n",
      "Uploading...: 100%|████████████████████████| 77.2M/77.2M [00:12<00:00, 6.37MB/s]\n",
      "https://huggingface.co/datasets/mesolitica/Transcription-Instructions/blob/main//data/science-00000-of-00001.parquet\n"
     ]
    }
   ],
   "source": [
    "!huggingface-cli upload mesolitica/Transcription-Instructions \\\n",
    "extra-stt-science.parquet /data/science-00000-of-00001.parquet --repo-type=dataset"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
